
* ---- Session setup ----
* Close any log still open from an earlier run and turn off output paging.
capture log close
set more off

* Empty the data in memory, then request 500m of memory.
clear
set mem 500m

* Project folder layout, held in globals that the rest of the script uses.
global over "\\Client\C$\Users\Gabe\Documents\Stern\Classes\Decision Models\Project"
global data "$over\Data"
global source "$over\Orginal data"
global output "$over\Output files"

* Minimum review count setting.
global minreview = 50

* Record this run, overwriting the log from any previous run.
log using "$data\Dedup Beer list.log", replace

/*
insheet using "$source\ba-beer.csv"

count
duplicates drop
count

save "$data\ba-beer, unique.dta", replace
outsheet using "$data\ba-beer, unique.csv", comma replace

*Pull big data set
insheet using "$source\ba-review-no-text.csv", clear

save "$data\ba-beer full.dta", replace


***give each reviewer only one review of the beer
*use "$data\ba-beer full.dta", clear

gen person_review_count = 1

collapse (mean) appearance aroma palate taste overall (sum) person_review_count, by(name baid  babrewerid abv style profilename) fast

save "$data\ba-beer, one review per person beer.dta", replace

********Categorize beers
use "$data\ba-beer, one review per person beer.dta", clear

gen review_count = 1

*collapse (mean) appearance aroma palate taste overall (sum) review_count, by(name baid brewerid abv style) fast

collapse (mean) appearance aroma palate taste overall (sd) appearance_sd=appearance aroma_sd=aroma palate_sd=palate taste_sd=taste overall_sd=overall (sum) review_count, by(name baid  babrewerid abv style) fast

tostring baid, gen (_baid)
replace _baid = "_" + _baid

gen style_ = subinstr(trim(itrim(style))," ","_",.)

sort _baid

save "$data\ba-beer, beer level summary.dta", replace
outsheet using "$data\ba-beer, beer level summary.csv", comma replace
outsheet using "$output\ba-beer, beer level summary.csv", comma replace

*/

/*
**************Pull in cluster level data
insheet using "$source\Cluster level data.csv", clear

rename attribute cluster

replace cluster = "cluster" + cluster if cluster != "Full Data"

replace count = subinstr(subinstr(count,"(","",1),")","",1)
destring count, replace

sort cluster
save "$data\cluster level data.dta", replace
*/

* ---- Beer-to-cluster mapping ----
* Read the beer-level cluster assignments; only the beer key and its cluster are kept.
insheet using "$source\Beer level cluster data.csv", clear
keep _baid cluster

* Save the mapping sorted on the key so that later merges can use it as-is.
sort _baid
save "$data/Beer level cluster data.dta", replace

* Attach the beer-level summary. The key must be unique on both sides, and the
* assert stops the run unless every record matched.
merge _baid using "$data\ba-beer, beer level summary.dta", unique
tabulate _merge
assert _merge == 3
drop _merge

* Write the combined data set to the data folder and, as CSV, to the output folder.
save "$data\beer level summary, with clusters.dta", replace
outsheet using "$output\beer level summary, with clusters.csv", comma replace

* ---- Pick one beer to represent each cluster ----
* Starts from the data in memory (beer-level summary with clusters) and ends
* with a two-column file: cluster and the key of its representative beer.

* The review-count cutoff is read from the minreview global so it can be
* changed in one place; fall back to 50 when the global has not been set.
if "$minreview" == "" {
	global minreview = 50
}

* Largest review count found within each cluster.
bysort cluster: egen max_rev_count = max(review_count)

* In clusters that have at least one beer at or above the cutoff, discard the
* beers below it. Clusters with no beer at the cutoff keep all their beers.
drop if review_count < $minreview & max_rev_count >= $minreview

* Negated copies, so that an ascending sort puts the highest values first.
gen neg_overall = overall * -1
gen neg_rev_count = review_count * -1

* Rank within cluster: highest overall, then most reviews, then smallest
* overall_sd, with _baid as the final tie-breaker.
bysort cluster (neg_overall neg_rev_count overall_sd _baid): gen priority = _n

* Keep only the top-ranked beer of each cluster.
keep if priority == 1

drop neg_overall neg_rev_count priority

save "$data\Cluster beer representatives.dta", replace

* Trimmed, ordered copy for the output folder.
keep cluster _baid name babrewerid abv style_  appearance aroma palate taste overall review_count
order cluster _baid name babrewerid abv style_  appearance aroma palate taste overall review_count

outsheet using "$output\Cluster beer representatives.csv", comma replace

* Lookup of cluster to representative beer key, saved sorted on cluster so it
* can be used directly as the using file of a later merge.
keep cluster _baid

rename _baid _baid_clus_rep

sort cluster
save "$data\Cluster and beer rep only.dta", replace


******Create customer data file
* Builds one row per reviewer (profilename) holding:
*   - summary statistics of that reviewer's overall scores, and
*   - one c_ column per cluster with the reviewer's overall score for that cluster.
* The cluster score is the score given to the cluster's representative beer when
* the reviewer rated it, otherwise the mean over the other beers of that cluster
* that the reviewer rated.
use "$data\ba-beer, one review per person beer.dta", clear

* String beer key (underscore followed by the id), used to merge on _baid below.
tostring baid, gen (_baid)
replace _baid = "_" + _baid

* Per-reviewer summary of the overall score, saved sorted on profilename so it
* can be merged back at the end. preserve and restore leave the review-level
* data in memory untouched.
preserve
	gen review_count = 1
	
	collapse (mean) overall_mean=overall (median) overall_med=overall (p75) overall_p75=overall (p90) overall_p90=overall (sum) review_count, by( profilename ) fast
	
	sort profilename
	save "$data\profile overall statistics.dta", replace
	outsheet using "$data\profile overall statistics.csv", comma replace
restore

* Attach each beer's cluster; every review must find its beer in the mapping.
sort _baid
merge _baid using "$data/Beer level cluster data.dta", uniqusing
tab _merge
assert _merge == 3
drop _merge

* Attach the key of the representative beer of that cluster.
sort cluster
merge cluster using "$data\Cluster and beer rep only.dta", uniqusing
tab _merge
assert _merge == 3
drop _merge

* rev_match is 1 when the review is of the cluster's representative beer.
gen rev_match = _baid == _baid_clus_rep

* Mean overall score per reviewer and cluster, split by rev_match. This gives
* at most two rows per reviewer and cluster.
collapse (mean) overall, by(cluster profilename rev_match) fast

bysort cluster profilename (rev_match): gen count_per = _N
tab count_per rev_match

* When both rows exist, the score for the representative beer wins.
drop if count_per == 2 & rev_match == 0
drop count_per

* Check that exactly one row per reviewer and cluster is left.
bysort cluster profilename (rev_match): gen count_per = _N
tab count_per
assert count_per == 1
drop count_per

drop rev_match

count

* Strip the cluster prefix so the reshaped columns are named c_ plus the remainder.
replace cluster = subinstr(cluster,"cluster","",1)

rename overall c_

* One row per reviewer, one c_ column per cluster.
reshape wide c_, i(profilename) j(cluster) string

* Add the per-reviewer summary statistics; every reviewer must match.
sort profilename
merge profilename using "$data\profile overall statistics.dta", uniqusing
tab _merge
assert _merge == 3
drop _merge

order profilename overall_mean overall_med overall_p75 overall_p90 review_count

save "$data\Customer cluster preference matrix.dta", replace
outsheet using "$output\Customer cluster preference matrix.csv", comma replace

* Close the log opened at the top of the script so it does not stay open
* after the run finishes.
capture log close

***End
